smallRNA Differential Expression

Author

Lance Parsons

Load libraries

This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().

install.packages("renv")
renv::restore()

Read sample data

Get list of samples

Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE) %>%
  mutate(
    day = as.factor(paste0("day", day)),
    batch = as.factor(case_when(
      exogenous_rna == "mastermix1" ~ "batch3",
      exogenous_rna == "mastermix2" ~ "batch3",
      TRUE ~ "batch2"
    )),
    .before = cell_line,
  )

sample_units

Table of exogenous RNA mixtures

Code
# Exogenous RNA mixtures
rna_mixes <- tibble()
for (mix in c("mastermix1", "mastermix2", "PJY103_mDNMT1", "PJY300_mDNMT1")) {
  t <- Biostrings::readDNAStringSet(sprintf("data/references/%s.fa", mix))
  rna_mixes <- rbind(rna_mixes, tibble(
    exogenous_rna = mix,
    rna_species = word(t@ranges@NAMES, 1),
    start = 1,
    end = t@ranges@width
  ))
}

rna_mixes <- rna_mixes %>%
  mutate(
    active_start_min = case_when(
      exogenous_rna == "mastermix1" ~ 11,
      exogenous_rna == "mastermix2" ~ 11,
      exogenous_rna == "PJY103_mDNMT1" ~ 11,
      exogenous_rna == "PJY300_mDNMT1" ~ 11
    ),
    active_start_max = case_when(
      exogenous_rna == "mastermix1" ~ 14,
      exogenous_rna == "mastermix2" ~ 14,
      exogenous_rna == "PJY103_mDNMT1" ~ 15,
      exogenous_rna == "PJY300_mDNMT1" ~ 15
    ),
    cyptic_terminator_end = case_when(
      exogenous_rna == "mastermix1" ~ 37,
      exogenous_rna == "mastermix2" ~ 37,
      exogenous_rna == "PJY103_mDNMT1" ~ 38,
      exogenous_rna == "PJY300_mDNMT1" ~ 38
    ),
  )

rna_mixes

Exogenous RNA counts

BAM reading functions

Code
rna_species_plot_range <- function(sequence_name) {
  plot_range <- rna_mixes %>%
    dplyr::filter(.data$rna_species == sequence_name) %>%
    dplyr::select(start, end) %>%
    unique()
  return(plot_range)
}

# GRanges from BAM
granges_from_bam <- function(sample_unit,
                             sequence_name,
                             is_proper_pair = TRUE,
                             bam_dir =
                               "results/alignments/exogenous_rna/sorted") {
  plot_range <- rna_species_plot_range(sequence_name)
  which <- GRanges(
    sprintf("%s:%i-%i", sequence_name, plot_range$start, plot_range$end)
  )

  param <- ScanBamParam(
    flag = scanBamFlag(isProperPair = is_proper_pair),
    mapqFilter = 1,
    which = which
  )

  aligned_fragments_list <- list()

  if (is_proper_pair) {
    aligned_fragments <- granges(
      readGAlignmentPairs(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      ),
      on.discordant.seqnames = "drop"
    )
    rna_info <- rna_mixes %>%
      dplyr::filter(rna_species == {{ sequence_name }}) %>%
      dplyr::select(-"exogenous_rna") %>%
      unique()

    aligned_fragments_list$active <- aligned_fragments %>%
      plyranges::filter(
        start <= rna_info$active_start_max,
        end >= rna_info$cyptic_terminator_end
      )
    aligned_fragments_list$inactive <- aligned_fragments %>%
      plyranges::filter(
        start > rna_info$active_start_max,
        end >= rna_info$cyptic_terminator_end
      )
    aligned_fragments_list$premature_termination <- aligned_fragments %>%
      plyranges::filter(end < rna_info$cyptic_terminator_end)
  } else {
    aligned_fragments <- granges(
      readGAlignments(
        sprintf("%s/%s.bam", bam_dir, sample_unit),
        param = param
      )
    )
    aligned_fragments_list$discordant <- aligned_fragments
  }
  return(aligned_fragments_list)
}

Read BAM coverage

Code
rna_species_plot_data <- list()
for (rna_species_to_plot in rna_mixes$rna_species) {
  rna_info <- rna_mixes %>%
    dplyr::filter(rna_species == rna_species_to_plot)

  sample_units_to_plot <- sample_units %>%
    dplyr::filter(exogenous_rna %in% rna_info$exogenous_rna)

  granges_to_plot <- list()
  for (sample_unit in sample_units_to_plot$sample_unit) {
    granges_to_plot[[sample_unit]] <- granges_from_bam(
      sample_unit,
      rna_species_to_plot,
      TRUE
    )
  }
  rna_species_plot_data[[rna_species_to_plot]] <- granges_to_plot
}

Organize coverage data

Code
exogenous_rna_count_data <- tibble(
  sample_unit = character(),
  sequence_name = character(),
  category = character(),
  count = numeric()
)
for (rna_species in names(rna_species_plot_data)) {
  for (sample_unit in names(rna_species_plot_data[[rna_species]])) {
    for (category in
      names(rna_species_plot_data[[rna_species]][[sample_unit]])) {
      exogenous_rna_count_data <- exogenous_rna_count_data %>%
        add_row(
          sample_unit = sample_unit,
          sequence_name = rna_species,
          category = category,
          count = length(
            rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
          )
        )
    }
  }
}
exogenous_rna_count_data

Summarize coverage data

Code
exogenous_rna_mapped_totals <- exogenous_rna_count_data %>%
  group_by(sample_unit, sequence_name) %>%
  summarize(mapped_fragments = sum(count))
`summarise()` has grouped output by 'sample_unit'. You can override using the
`.groups` argument.
Code
exogenous_rna_mapped_totals

small RNA human gene counts

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
  • exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
human_gene_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_gene_count.txt"
)

human_gene_counts <- readr::read_tsv(
  human_gene_counts_files[1],
  comment = "#",
  col_names = c("gene", human_gene_counts_files[1]),
  col_types = "ci"
)

for (i in 2:length(human_gene_counts_files)) {
  gene_sample <-
    readr::read_tsv(
      human_gene_counts_files[i],
      comment = "#",
      col_names = c("gene", human_gene_counts_files[i]),
      col_types = "ci"
    )
  human_gene_counts <- human_gene_counts %>%
    dplyr::full_join(gene_sample, by = "gene")
}

human_gene_counts <- human_gene_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ str_replace_all(., "_first_proper_pair_gene_count.txt", ""))

exogenous_gene_counts <- exogenous_rna_count_data %>%
  tidyr::unite(gene, c(sequence_name, category), sep = ":") %>%
  tidyr::spread(sample_unit, count)

gene_counts <- rbind(human_gene_counts, exogenous_gene_counts)

gene_counts

Exogenous RNA “gene” names

Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
  dplyr::filter(str_detect(gene, "^PJY")) %>%
  pull(gene)
exogenous_rna_names
 [1] "PJY103_mDNMT1:active"                "PJY103_mDNMT1:inactive"             
 [3] "PJY103_mDNMT1:premature_termination" "PJY177_RNF2:active"                 
 [5] "PJY177_RNF2:inactive"                "PJY177_RNF2:premature_termination"  
 [7] "PJY179_FANCF:active"                 "PJY179_FANCF:inactive"              
 [9] "PJY179_FANCF:premature_termination"  "PJY181_HEK3:active"                 
[11] "PJY181_HEK3:inactive"                "PJY181_HEK3:premature_termination"  
[13] "PJY182_HEK3:active"                  "PJY182_HEK3:inactive"               
[15] "PJY182_HEK3:premature_termination"   "PJY183_DNMT1:active"                
[17] "PJY183_DNMT1:inactive"               "PJY183_DNMT1:premature_termination" 
[19] "PJY184_DNMT1:active"                 "PJY184_DNMT1:inactive"              
[21] "PJY184_DNMT1:premature_termination"  "PJY185_RUNX1:active"                
[23] "PJY185_RUNX1:inactive"               "PJY185_RUNX1:premature_termination" 
[25] "PJY186_RUNX1:active"                 "PJY186_RUNX1:inactive"              
[27] "PJY186_RUNX1:premature_termination"  "PJY187_VEGFA:active"                
[29] "PJY187_VEGFA:inactive"               "PJY187_VEGFA:premature_termination" 
[31] "PJY300_mDNMT1:active"                "PJY300_mDNMT1:inactive"             
[33] "PJY300_mDNMT1:premature_termination" "PJY302_EMX1:active"                 
[35] "PJY302_EMX1:inactive"                "PJY302_EMX1:premature_termination"  
[37] "PJY306_EMX1:active"                  "PJY306_EMX1:inactive"               
[39] "PJY306_EMX1:premature_termination"  

Import sample metadata and counts into DESeq2

Read these counts into DESeq2 along with the sample metadata.

Set the design to ~ exogenous_rna + day + cell_line.

Code
dds <- DESeqDataSetFromMatrix(
  countData = gene_counts %>%
    tibble::column_to_rownames("gene") %>%
    replace(is.na(.), 0),
  colData = sample_units,
  design = ~ exogenous_rna + day + cell_line
)
converting counts to integer mode
Code
dds
class: DESeqDataSet 
dim: 56726 56 
metadata(1): version
assays(1): counts
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
  PJY306_EMX1:premature_termination
rowData names(0):
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(9): sample_unit sample_name ... fq1 fq2

Sample comparisons

Variance Stabalized Transformation

Code
vsd <- vst(dds, blind = FALSE)

Heatmap of sample-to-sample distances

Code
sample_dists <- dist(t(assay(vsd)))

sample_dist_matrix <- as.matrix(sample_dists)
rownames(sample_dist_matrix) <- paste(vsd$cell_line,
  vsd$exogenous_rna,
  vsd$day,
  sep = "-"
)
colnames(sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sample_dist_matrix,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  col = colors
)

PCA plot of samples

Code
plotPCA(vsd, intgroup = c("cell_line", "exogenous_rna", "day"))

Differential Expression

Batch 2 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day1 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day1")
dds_batch2_day1$exogenous_rna <- droplevels(dds_batch2_day1$exogenous_rna)
dds_batch2_day1$day <- droplevels(dds_batch2_day1$day)
design(dds_batch2_day1) <- ~ exogenous_rna + cell_line
dds_batch2_day1 <- DESeq(dds_batch2_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day1
class: DESeqDataSet 
dim: 56726 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
  PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day1_rep1
  P1E10_sorted_PJY103_pegRNA_day1_rep2 ...
  Parental_PJY300_epegRNA_day1_rep3 Parental_PJY300_epegRNA_day1_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day1 <- results(dds_batch2_day1, alpha = 0.05)
res_batch2_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56726 rows and 6 columns
                                   baseMean log2FoldChange     lfcSE      stat
                                  <numeric>      <numeric> <numeric> <numeric>
GAS5                                3358381      0.1558718 0.0335418  4.647095
SNORD30                             1605167      0.0910242 0.0576592  1.578660
SNORD49A                            1156164      0.1842127 0.0436446  4.220748
SNORD26                              857547      0.0954226 0.0539986  1.767132
SNHG1                                780897     -0.0217632 0.0400343 -0.543614
...                                     ...            ...       ...       ...
PJY302_EMX1:inactive                      0             NA        NA        NA
PJY302_EMX1:premature_termination         0             NA        NA        NA
PJY306_EMX1:active                        0             NA        NA        NA
PJY306_EMX1:inactive                      0             NA        NA        NA
PJY306_EMX1:premature_termination         0             NA        NA        NA
                                       pvalue        padj
                                    <numeric>   <numeric>
GAS5                              3.36642e-06 7.69448e-05
SNORD30                           1.14414e-01 4.51749e-01
SNORD49A                          2.43493e-05 4.79464e-04
SNORD26                           7.72060e-02 3.61519e-01
SNHG1                             5.86707e-01 8.79696e-01
...                                       ...         ...
PJY302_EMX1:inactive                       NA          NA
PJY302_EMX1:premature_termination          NA          NA
PJY306_EMX1:active                         NA          NA
PJY306_EMX1:inactive                       NA          NA
PJY306_EMX1:premature_termination          NA          NA
Code
summary(res_batch2_day1)

out of 51621 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 937, 1.8%
LFC < 0 (down)     : 1749, 3.4%
outliers [1]       : 1, 0.0019%
low counts [2]     : 23895, 46%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day1_lfc <- lfcShrink(dds_batch2_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day1_lfc_df <- res_batch2_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day1_lfc_df, file = "diffexp_results_batch2_day1.tsv")

res_batch2_day1_lfc_labelled <- res_batch2_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch2_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch2_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 5105 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day1_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list)

Batch 2 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch2_day2 <- subset(dds, select = colData(dds)$batch == "batch2" &
  colData(dds)$day == "day2")
dds_batch2_day2$exogenous_rna <- droplevels(dds_batch2_day2$exogenous_rna)
dds_batch2_day2$day <- droplevels(dds_batch2_day2$day)
design(dds_batch2_day2) <- ~ exogenous_rna + cell_line
dds_batch2_day2 <- DESeq(dds_batch2_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch2_day2
class: DESeqDataSet 
dim: 56726 16 
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
  PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day2_rep1
  P1E10_sorted_PJY103_pegRNA_day2_rep2 ...
  Parental_PJY300_epegRNA_day2_rep3 Parental_PJY300_epegRNA_day2_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day2 <- results(dds_batch2_day2, alpha = 0.05)
res_batch2_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56726 rows and 6 columns
                                   baseMean log2FoldChange     lfcSE      stat
                                  <numeric>      <numeric> <numeric> <numeric>
GAS5                                3192415     0.18567762 0.0302890  6.130190
SNORD30                             1514330     0.15582664 0.0511819  3.044562
SNORD49A                            1170290     0.19302856 0.0427795  4.512171
SNORD26                              825436     0.09512922 0.0397968  2.390377
SNHG1                                678111     0.00437697 0.0311162  0.140665
...                                     ...            ...       ...       ...
PJY302_EMX1:inactive                      0             NA        NA        NA
PJY302_EMX1:premature_termination         0             NA        NA        NA
PJY306_EMX1:active                        0             NA        NA        NA
PJY306_EMX1:inactive                      0             NA        NA        NA
PJY306_EMX1:premature_termination         0             NA        NA        NA
                                       pvalue        padj
                                    <numeric>   <numeric>
GAS5                              8.77739e-10 3.21830e-08
SNORD30                           2.33019e-03 2.78025e-02
SNORD49A                          6.41674e-06 1.40180e-04
SNORD26                           1.68311e-02 1.34207e-01
SNHG1                             8.88134e-01 9.77382e-01
...                                       ...         ...
PJY302_EMX1:inactive                       NA          NA
PJY302_EMX1:premature_termination          NA          NA
PJY306_EMX1:active                         NA          NA
PJY306_EMX1:inactive                       NA          NA
PJY306_EMX1:premature_termination          NA          NA
Code
summary(res_batch2_day2)

out of 50454 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 875, 1.7%
LFC < 0 (down)     : 1593, 3.2%
outliers [1]       : 53, 0.11%
low counts [2]     : 24295, 48%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch2_day2_lfc <- lfcShrink(dds_batch2_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch2_day2_lfc_df <- res_batch2_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch2_day2_lfc_df, file = "diffexp_results_batch2_day2.tsv")

res_batch2_day2_lfc_labelled <- res_batch2_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch2_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch2_day2_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6272 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch2_day2_lfc_labelled$gene) {
  gene_prefix <- str_split(gene, "_")[[1]][1]
  d <- plotCounts(dds_batch2_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(gene_prefix, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list)

Batch 3 - Day 1

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day1 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day1")
dds_batch3_day1$exogenous_rna <- droplevels(dds_batch3_day1$exogenous_rna)
dds_batch3_day1$day <- droplevels(dds_batch3_day1$day)
design(dds_batch3_day1) <- ~ exogenous_rna + cell_line
dds_batch3_day1 <- DESeq(dds_batch3_day1, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day1
class: DESeqDataSet 
dim: 56726 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
  PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day1_rep2
  P1E10_sorted_mastermix2_day1_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day1 <- results(dds_batch3_day1, alpha = 0.05)
res_batch3_day1
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56726 rows and 6 columns
                                   baseMean log2FoldChange     lfcSE      stat
                                  <numeric>      <numeric> <numeric> <numeric>
GAS5                                3961072      0.2446362 0.0339371  7.208511
SNORD30                             1821940      0.2591363 0.0384223  6.744423
SNORD49A                            1512020      0.2499211 0.0373335  6.694278
SNORD26                              978999      0.2163275 0.0334244  6.472148
SNHG1                                822970      0.0132365 0.0371975  0.355845
...                                     ...            ...       ...       ...
PJY302_EMX1:inactive               5291.362      -1.733848 0.0911872 -19.01417
PJY302_EMX1:premature_termination   678.258      -0.942775 0.0920436 -10.24270
PJY306_EMX1:active                 3244.590       0.822682 0.0699691  11.75778
PJY306_EMX1:inactive                557.327      -0.116098 0.0758092  -1.53145
PJY306_EMX1:premature_termination   101.022      -1.211226 0.1573306  -7.69861
                                       pvalue        padj
                                    <numeric>   <numeric>
GAS5                              5.65670e-13 2.11559e-11
SNORD30                           1.53637e-11 5.08829e-10
SNORD49A                          2.16740e-11 7.07169e-10
SNORD26                           9.66192e-11 2.93474e-09
SNHG1                             7.21957e-01 9.28340e-01
...                                       ...         ...
PJY302_EMX1:inactive              1.30172e-80 4.14871e-78
PJY302_EMX1:premature_termination 1.27619e-24 1.03560e-22
PJY306_EMX1:active                6.44051e-32 6.68082e-30
PJY306_EMX1:inactive              1.25658e-01 4.63556e-01
PJY306_EMX1:premature_termination 1.37558e-14 5.81737e-13
Code
summary(res_batch3_day1)

out of 49768 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 862, 1.7%
LFC < 0 (down)     : 1733, 3.5%
outliers [1]       : 1, 0.002%
low counts [2]     : 27776, 56%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day1_lfc <- lfcShrink(dds_batch3_day1,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day1_lfc_df <- res_batch3_day1_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day1_lfc_df, file = "diffexp_results_batch3_day1.tsv")

res_batch3_day1_lfc_labelled <- res_batch3_day1_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch3_day1_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, colour = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch3_day1_lfc_labelled,
    aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    max.overlaps = 30,
    size = 2.5,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6958 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day1_lfc_labelled$gene) {
  gene_rna_type <- str_split(gene, ":")[[1]][1]
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene_rna_type) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")

  d <- plotCounts(dds_batch3_day1,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 3)

Batch 3 - Day 2

Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds_batch3_day2 <- subset(dds, select = colData(dds)$batch == "batch3" &
  colData(dds)$day == "day2")
dds_batch3_day2$exogenous_rna <- droplevels(dds_batch3_day2$exogenous_rna)
dds_batch3_day2$day <- droplevels(dds_batch3_day2$day)
design(dds_batch3_day2) <- ~ exogenous_rna + cell_line
dds_batch3_day2 <- DESeq(dds_batch3_day2, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
dds_batch3_day2
class: DESeqDataSet 
dim: 56726 12 
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
  PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day2_rep1
  Parental_mastermix1_day2_rep2 ... P1E10_sorted_mastermix2_day2_rep2
  P1E10_sorted_mastermix2_day2_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day2 <- results(dds_batch3_day2, alpha = 0.05)
res_batch3_day2
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56726 rows and 6 columns
                                   baseMean log2FoldChange     lfcSE       stat
                                  <numeric>      <numeric> <numeric>  <numeric>
GAS5                                4358510       0.366504 0.0344499  10.638768
SNORD30                             2397541       0.329002 0.0411267   7.999713
SNORD49A                            1753252       0.415111 0.0480745   8.634753
SNORD26                             1116920       0.229311 0.0408223   5.617289
SNHG1                                818061       0.037861 0.0420172   0.901082
...                                     ...            ...       ...        ...
PJY302_EMX1:inactive               7483.494     -1.4442936 0.1202005 -12.015708
PJY302_EMX1:premature_termination   610.148      0.0215212 0.0909814   0.236545
PJY306_EMX1:active                 5055.971      0.8342056 0.0894112   9.329987
PJY306_EMX1:inactive                673.181      0.0215212 0.0893793   0.240785
PJY306_EMX1:premature_termination    88.067     -0.4689319 0.1847407  -2.538325
                                       pvalue        padj
                                    <numeric>   <numeric>
GAS5                              1.96712e-26 1.72482e-24
SNORD30                           1.24710e-15 5.66381e-14
SNORD49A                          5.88542e-18 3.19903e-16
SNORD26                           1.93976e-08 4.78468e-07
SNHG1                             3.67545e-01 7.16626e-01
...                                       ...         ...
PJY302_EMX1:inactive              2.93843e-33 3.44370e-31
PJY302_EMX1:premature_termination 8.13010e-01 9.48110e-01
PJY306_EMX1:active                1.05884e-20 7.06626e-19
PJY306_EMX1:inactive              8.09722e-01 9.47014e-01
PJY306_EMX1:premature_termination 1.11385e-02 7.40867e-02
Code
summary(res_batch3_day2)

out of 50043 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 1167, 2.3%
LFC < 0 (down)     : 2127, 4.3%
outliers [1]       : 6, 0.012%
low counts [2]     : 26012, 52%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

Code
res_batch3_day2_lfc <- lfcShrink(dds_batch3_day2,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot (lfc shrunk)

MA Plot of shrunken log2 fold changes

Code
res_batch3_day2_lfc_df <- res_batch3_day2_lfc %>%
  as.data.frame() %>%
  rownames_to_column("gene") %>%
  dplyr::mutate(significant = padj < 0.05)

write_tsv(res_batch3_day2_lfc_df, file = "diffexp_results_batch3_day2.tsv")

res_batch3_day2_lfc_labelled <- res_batch3_day2_lfc_df %>%
  dplyr::filter(gene %in% exogenous_rna_names) %>%
  na.omit()

ggplot(
  res_batch3_day2_lfc_df,
  aes(x = log2(baseMean), y = log2FoldChange, color = significant)
) +
  geom_point() +
  ggrepel::geom_label_repel(
    data = res_batch3_day2_lfc_labelled,
    mapping = aes(label = gene, segment.colour = "black"),
    min.segment.length = 0,
    max.overlaps = 30,
    size = 2.5,
    show.legend = FALSE
  ) +
  theme_bw()
Warning: Removed 6683 rows containing missing values (geom_point).

Counts of exogenous rna

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plot_list <- list()
for (gene in res_batch3_day2_lfc_labelled$gene) {
  gene_rna_type <- str_split(gene, ":")[[1]][1]
  exogenous_rna <- rna_mixes %>%
    dplyr::filter(rna_species == gene_rna_type) %>%
    pull(exogenous_rna)
  exogenous_rna_regex <- paste(exogenous_rna, collapse = "|")

  d <- plotCounts(dds_batch3_day2,
    gene = gene,
    intgroup = "cell_line",
    returnData = TRUE
  )
  p <- ggplot(
    d %>%
      rownames_to_column("sample_unit") %>%
      dplyr::filter(grepl(exogenous_rna_regex, sample_unit)),
    aes(x = cell_line, y = count)
  ) +
    geom_point(position = position_jitter(width = 0.1, height = 0)) +
    ggtitle(gene)
  plot_list[[gene]] <- p
}
cowplot::plot_grid(plotlist = plot_list, ncol = 3)